## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## Loading required package: lattice
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
##
## Attaching package: 'seriation'
## The following object is masked from 'package:lattice':
##
## panel.lines
## Loading required package: pdc
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Create a vector containing all attribute names
attribute.names = c()
for(pixel.number in 1:9) {
for(attribute.suffix in c("Red", "Green", "NIR1", "NIR2")) {
attribute.name = paste("p", pixel.number, ".", attribute.suffix, sep="")
attribute.names = c(attribute.names, attribute.name)
}
}
attribute.names = c(attribute.names, "class")
# Create a vector containing all class names
class.names = c("red soil", "cotton crop", "grey soil", "damp grey soil", "soil with vegetation stubble", "very damp grey soil")
# Load the dataset from the csv file and cast the label to factor
dataset = data.frame(read.csv("dataset.csv", col.names = attribute.names, sep=" "))
dataset$class = factor(dataset$class, labels = class.names)
# Determine the input and the output space
dataset.input = dataset[, 1:ncol(dataset)-1]
dataset.output = dataset$class
# Determine some statistical measures on the input attributes
dataset.mean = sapply(dataset.input, mean)
dataset.sd = sapply(dataset.input, sd)
dataset.var = sapply(dataset.input, var)
dataset.min = sapply(dataset.input, min)
dataset.max = sapply(dataset.input, max)
dataset.median = sapply(dataset.input, median)
dataset.range = sapply(dataset.input, range)
dataset.quantile = sapply(dataset.input, quantile)
# Determine class frequencies
class.frequencies = data.frame(table(dataset.output))
colnames(class.frequencies) = c("class", "frequencies")
class.frequencies$percentages = paste(format((class.frequencies$frequencies / sum(class.frequencies$frequencies)) * 100, digits = 2), "%", sep = "")
ggplot(data = class.frequencies, aes(x = class, y = frequencies, fill = class)) + geom_bar(stat = "identity") + geom_text(aes(label = percentages), vjust = 1.5, colour = "white") + theme(axis.text.x=element_blank())
ggplot(data = class.frequencies, aes(x = "", y = frequencies, fill = class)) + geom_bar(stat = "identity", width = 1, color = "white") + coord_polar("y", start = 0)
## Separabilità I valori relativi allo stesso canale tra pixel adiacenti risultano correlati (come evidente dai grafici), questo perche’ il dataset e’ costruito come una sliding window quadrata 3x3 su una singola immagine satellitare. Ci si aspetta quindi una varianza piuttosto contenuta sui singoli valori e la sopra citata correlazione.
TODO: Aggiungere titoli ai featureplot
featurePlot(dataset.input[, c("p5.Red", "p5.Green", "p5.NIR1")], dataset.output, plot="pairs", scales=list((relation="free"), y=list(relation="free")), auto.key=list(columns=3))
featurePlot(dataset.input[, c("p5.Red", "p6.Red", "p4.Red")], dataset.output, plot="pairs", scales=list((relation="free"), y=list(relation="free")), auto.key=list(columns=3))
featurePlot(dataset.input[, c("p5.Green", "p6.Green", "p4.Green")], dataset.output, plot="pairs", scales=list((relation="free"), y=list(relation="free")), auto.key=list(columns=3))
featurePlot(dataset.input[, c("p5.Green", "p2.Green", "p8.Green")], dataset.output, plot="pairs", scales=list((relation="free"), y=list(relation="free")), auto.key=list(columns=3))
featurePlot(dataset.input[, c("p5.Red", "p2.Red", "p8.Red")], dataset.output, plot="pairs", scales=list((relation="free"), y=list(relation="free")), auto.key=list(columns=3))
featurePlot(dataset.input[, c("p5.NIR1", "p2.NIR1", "p8.NIR1")], dataset.output, plot="pairs", scales=list((relation="free"), y=list(relation="free")), auto.key=list(columns=3))
Data la correlazione rilevata precedentemente e l’alto numero di attributi in input sembra più che opportuno eseguire una PCA sul dataset al fine di ridurre la dimensione dello spazio di input.
# Execute the PCA. Extract only the first four principal components
pca.results <- PCA(dataset.input, scale.unit = TRUE, ncp = 4, graph = FALSE)
# Get the eigenvalues and plot them. The first four components explain about 92% of variance
pca.results.eig.val <- get_eigenvalue(pca.results)
fviz_eig(pca.results, addlabels = TRUE, ylim = c(0, 50))
# Show the correlation between the variables using the first two principal components
# Many variable are positively correlated
fviz_pca_var(pca.results, col.var = "black")
Analizzando le coordinate per i singoli attributi si riesce a capire i 4 cluster rappresentano i 4 tipi di variabili in gioco (Red, Green NIR1, NIR2)
# Extract the principal components
dataset.input.pca = data.frame(get_pca_ind(pca.results)$coord)
dataset.pca = cbind(class=dataset$class, dataset.input.pca)
# Draw some feature plot to highlight the level of difficulty to recognise the various classes
featurePlot(x=dataset.input.pca, y=dataset.output, plot="density", scales=list(x=list(relation="free"), y=list(relation="free")), auto.key=list(columns=3))
featurePlot(x=dataset.input.pca, y=dataset.output, plot="pairs", auto.key=list(columns=3))
# Divide the dataset into trainset and testset
indexes = sample(2, nrow(dataset.pca), replace = TRUE, prob = c(0.7, 0.3))
temp.trainset = dataset.pca[indexes == 1,]
testset = dataset.pca[indexes == 2,]
testset.x = testset[, !names(testset) %in% c("class")]
testset.y = testset[, c("class")]
# Divide the trainset to obtain a validationset
indexes = sample(2, nrow(temp.trainset), replace = TRUE, prob = c(0.9, 0.1))
trainset = temp.trainset[indexes == 1,]
validationset = temp.trainset[indexes == 2,]
validationset.x = validationset[, !names(validationset) %in% c("class")]
validationset.y = validationset[, c("class")]
# Train a SVM model and tune the hyperparameters (cost and gamma)
svm.tuning.control = tune.control(sampling = "fix") # To use validationset instead
svm.tuning = tune.svm(class ~ Dim.1 + Dim.2 + Dim.3 + Dim.4, data = trainset, kernel = "radial", gamma = 10^(-3:1), cost = 10^(-2:1), validation.x = validationset.x, validation.y = validationset.y, tunecontrol = svm.tuning.control, probability = TRUE)
plot(svm.tuning)
# Determine some metrics for each class
svm.tuned = svm.tuning$best.model
svm.tuned.prediction = predict(svm.tuned, testset.x)
svm.tuned.result = confusionMatrix(svm.tuned.prediction, testset.y, mode = "prec_recall")
# TODO: how is calculated? avg, micro-avg, macro-avg?
sprintf("Tuned SVM global accuracy is %2.1f%%", svm.tuned.result$overall["Accuracy"] * 100)
## [1] "Tuned SVM global accuracy is 88.4%"
print(svm.tuned.result$table)
## Reference
## Prediction red soil cotton crop grey soil damp grey soil
## red soil 460 0 7 1
## cotton crop 0 190 2 1
## grey soil 2 1 369 29
## damp grey soil 1 1 13 108
## soil with vegetation stubble 6 9 0 5
## very damp grey soil 0 2 9 38
## Reference
## Prediction soil with vegetation stubble very damp grey soil
## red soil 11 0
## cotton crop 6 0
## grey soil 3 9
## damp grey soil 1 39
## soil with vegetation stubble 181 10
## very damp grey soil 18 406
# Train a NNET model and tune the hyperparameters (size and decay)
nnet.tuning.control = tune.control(sampling = 'fix')
nnet.tuning = tune.nnet(class ~ Dim.1 + Dim.2 + Dim.3 + Dim.4, data = trainset, size = c(2:4), validation.x = validationset.x, validation.y = validationset.y, decay = c(0, 2^-2:1), probability = TRUE)
plot(nnet.tuning)
# Determine some metrics for each class
nnet.tuned = nnet.tuning$best.model
nnet.tuned.prediction = factor(predict(nnet.tuned, testset.x, type = "class"), levels = class.names)
nnet.tuned.result = confusionMatrix(nnet.tuned.prediction, testset.y, mode = "prec_recall")
# TODO: how is calculated? avg, micro-avg, macro-avg?
sprintf("Tuned NNET global accuracy is %2.1f%%", nnet.tuned.result$overall["Accuracy"] * 100)
## [1] "Tuned NNET global accuracy is 84.9%"
print(nnet.tuned.result$table)
## Reference
## Prediction red soil cotton crop grey soil damp grey soil
## red soil 459 2 6 1
## cotton crop 0 185 1 0
## grey soil 1 0 364 33
## damp grey soil 1 0 26 81
## soil with vegetation stubble 8 16 1 8
## very damp grey soil 0 0 2 59
## Reference
## Prediction soil with vegetation stubble very damp grey soil
## red soil 14 0
## cotton crop 7 0
## grey soil 0 8
## damp grey soil 1 52
## soil with vegetation stubble 168 15
## very damp grey soil 30 389
In questa sezione vengono calcolate e confrontate metriche di base come sensitivity, specificity e F-scores.
metrics_of_interest = c("Precision", "Specificity", "F1")
svm.macro.fscore = Reduce('+', svm.tuned.result$byClass[,"F1"]) / length(class.names)
svm.micro.fscore = Reduce('+', svm.tuned.result$byClass[,"F1"] * svm.tuned.result$byClass[,"Prevalence"])
sprintf("SVM macro F-Score: %1.3f", svm.macro.fscore)
## [1] "SVM macro F-Score: 0.859"
sprintf("SVM micro F-Score: %1.3f", svm.micro.fscore)
## [1] "SVM micro F-Score: 0.883"
print(svm.tuned.result$byClass[, metrics_of_interest])
## Precision Specificity F1
## Class: red soil 0.9603340 0.9870660 0.9704641
## Class: cotton crop 0.9547739 0.9948127 0.9452736
## Class: grey soil 0.8934625 0.9713914 0.9077491
## Class: damp grey soil 0.6625767 0.9686788 0.6260870
## Class: soil with vegetation stubble 0.8578199 0.9825378 0.8399072
## Class: very damp grey soil 0.8583510 0.9545455 0.8665955
# Determine some metrics for each class
nnet.macro.fscore = Reduce('+', nnet.tuned.result$byClass[,"F1"]) / length(class.names)
nnet.micro.fscore = Reduce('+', nnet.tuned.result$byClass[,"F1"] * nnet.tuned.result$byClass[,"Prevalence"])
sprintf("NNET macro F-Score: %1.3f", nnet.macro.fscore)
## [1] "NNET macro F-Score: 0.812"
sprintf("NNET micro F-Score: %1.3f", nnet.micro.fscore)
## [1] "NNET micro F-Score: 0.847"
sprintf("Tuned NNET metrics by class:")
## [1] "Tuned NNET metrics by class:"
nnet.tuned.result$byClass[, metrics_of_interest]
## Precision Specificity F1
## Class: red soil 0.9522822 0.9843431 0.9652997
## Class: cotton crop 0.9585492 0.9953890 0.9343434
## Class: grey soil 0.8965517 0.9726918 0.9032258
## Class: damp grey soil 0.5031056 0.9544419 0.4723032
## Class: soil with vegetation stubble 0.7777778 0.9720605 0.7706422
## Class: very damp grey soil 0.8104167 0.9382632 0.8241525
metrics.comparison = svm.tuned.result$byClass[, metrics_of_interest] - nnet.tuned.result$byClass[, metrics_of_interest]
print("Metrics compared, positive values favor SVM")
## [1] "Metrics compared, positive values favor SVM"
print(metrics.comparison)
## Precision Specificity F1
## Class: red soil 0.008051872 0.0027229408 0.005164450
## Class: cotton crop -0.003775353 -0.0005763689 0.010930197
## Class: grey soil -0.003089254 -0.0013003901 0.004523271
## Class: damp grey soil 0.159471097 0.0142369021 0.153783750
## Class: soil with vegetation stubble 0.080042127 0.0104772992 0.069264991
## Class: very damp grey soil 0.047934285 0.0162822252 0.042442975
# Predicts the most probable class class for each instance
svm.tuned.predictions = predict(svm.tuned, testset.x, probability = TRUE)
# Extracts, for each class, the probability of every instance of being of that class
svm.tuned.predictions.probs = attr(svm.tuned.predictions, "probabilities")
# Extracts, for each class, the probability of every instance of being of that class
nnet.tuned.predictions = predict(nnet.tuned, testset.x, probability = TRUE)
# Creates the dataframe to be used for multiroc
testset.multiroc = data.frame(matrix(ncol=18, nrow=0))
for (i in 1:length(testset.y)) {
row = matrix(0, 1, 6)
# sets 1 to the correct class
row[testset.y[i]] = 1
for (j in 1:6)
# concatenates the probabilities of the svm
row = c(row, svm.tuned.predictions.probs[i, class.names[j]])
# concatenates the probabilities of the neural net
row = c(row, nnet.tuned.predictions[i,])
testset.multiroc = rbind(testset.multiroc, row)
}
# Formats the dataframe to be suitable for the multi_roc function
colnames(testset.multiroc) = c('red_soil_true', 'cotton_crop_true', 'grey_soil_true', 'damp_grey_soil_true', 'vegetation_true', 'very_damp grey_soil_true', 'red_soil_pred_SVM', 'cotton_crop_pred_SVM', 'grey_soil_pred_SVM', 'damp_grey_soil_pred_SVM', 'vegetation_pred_SVM', 'very_damp grey_soil_pred_SVM', 'red_soil_pred_NN', 'cotton_crop_pred_NN', 'grey_soil_pred_NN', 'damp_grey_soil_pred_NN', 'vegetation_pred_NN', 'very_damp grey_soil_pred_NN')
svm.nnet.multiroc = multi_roc(testset.multiroc, force_diag=T)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
print("Per class AUC values:")
## [1] "Per class AUC values:"
print(unlist(svm.nnet.multiroc$AUC))
## SVM.red_soil SVM.cotton_crop SVM.grey_soil
## 0.9980333 0.9970103 0.9896131
## SVM.damp_grey_soil SVM.vegetation SVM.very_damp grey_soil
## 0.9420480 0.9834771 0.9794060
## SVM.macro SVM.micro NN.red_soil
## 0.9815677 0.9878187 0.9971247
## NN.cotton_crop NN.grey_soil NN.damp_grey_soil
## 0.9979614 0.9891157 0.9211714
## NN.vegetation NN.very_damp grey_soil NN.macro
## 0.9747513 0.9726436 0.9754171
## NN.micro
## 0.9846539
# MultiROC plotting
n_method <- length(unique(svm.nnet.multiroc$Methods))
n_group <- length(unique(svm.nnet.multiroc$Groups))
# changes the format of results to a ggplot2 friendly format
for (i in 1:n_group) {
res_df <- data.frame(Specificity= numeric(0), Sensitivity= numeric(0), Group = character(0), AUC = numeric(0), Method = character(0))
for (j in 1:n_method) {
temp_data_1 <- data.frame(Specificity=svm.nnet.multiroc$Specificity[[j]][i],
Sensitivity=svm.nnet.multiroc$Sensitivity[[j]][i],
Group=tools::toTitleCase(gsub("_", " ", unique(svm.nnet.multiroc$Groups)[i])),
AUC=svm.nnet.multiroc$AUC[[j]][i],
Method = unique(svm.nnet.multiroc$Methods)[j])
colnames(temp_data_1) <- c("Specificity", "Sensitivity", "Group", "AUC", "Method")
res_df <- rbind(res_df, temp_data_1)
}
print(ggplot2::ggplot(res_df, ggplot2::aes(x = 1-Specificity, y=Sensitivity)) + ggplot2::geom_path(ggplot2::aes(color = Group, linetype=Method)) + ggplot2::geom_segment(ggplot2::aes(x = 0, y = 0, xend = 1, yend = 1), colour='grey', linetype = 'dotdash') + ggplot2::theme_bw() + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5), legend.justification=c(1, 0), legend.position=c(.95, .05), legend.title=ggplot2::element_blank(), legend.background = ggplot2::element_rect(fill=NULL, size=0.5, linetype="solid", colour ="black")))
}
trainset.clustering = dataset.input.pca
labels = dataset.pca[1]
# dataframe to vector conversion
labels.list <- c()
labels.list = match(as.factor(labels$class), class.names)
set.seed(1)
nk = 2:12
# calculates average silhouette values for an ascending value of k
SW = sapply(nk, function(k){cluster.stats(dist(trainset.clustering),
kmeans(trainset.clustering, centers=k, nstart=5)$cluster)$avg.silwidth})
#SW
plot(nk, SW, type="l", xlab="number of clusters", ylab="average silhouette")
Pur ottenendo una misura minore di Silhouette in confronto a k=3, e’ stato comunque ritenuto opportuno analizzare i risultati ottenuti tramite l’esecuzioe con parametro k=6, che rappresenta il numero delle classi, e di conseguenza porta ad una maggiore misura di similarita’ tra i cluster rilevati da k-means e l’effettiva suddivisione in classi del dataset
fit = kmeans(trainset.clustering, 6, nstart=5)
# calculates silhouette values for k=6
kms = silhouette(fit$cluster, dist(trainset.clustering))
plot(kms, col=1:6, border=NA, main = "Silhouette Plot for k=6")
# Dissimilarity Matrix
dissplot(dist(trainset.clustering), labels=fit$cluster, options=list(main="Kmeans clustering with k=6"))
# Evaluation
sprintf("Similarity with (k=6): %f", cluster.evaluation(labels.list, fit$cluster))
## [1] "Similarity with (k=6): 0.689679"
# visualization
fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(1,2), main="Dimensions: 1 & 2",
geom="point")
fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(1,3), main="Dimensions: 1 & 3",
geom="point")
fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(1,4), main="Dimensions: 1 & 4",
geom="point")
fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(2,3), main="Dimensions: 2 & 3",
geom="point")
fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(2,4), main="Dimensions: 2 & 4",
geom="point")
fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(3,4), main="Dimensions: 3 & 4",
geom="point")
# Confusion Matrix
source("clustering_confusion_matrix.R")
confusion_matrix = clustering_confusion_matrix(fit$cluster, labels.list, 6, 6)
for (i in 1:6) {
pie_title = sprintf("Cluster %d Distribution", i)
pie(confusion_matrix[i,], main=pie_title, labels=class.names, col=rainbow(6))
}
#
for (i in 1:6) {
pie_title = sprintf("Class %s Distribution", class.names[i])
pie(confusion_matrix[,i], main=pie_title, labels=1:6, col=rainbow(6))
}
Nonostante il valore di Silhouette maggiore rispetto a k=6, il parametro k=3 porta ad un peggioramento nella misura di similarita’ tra cluster rilevati e classi del dataset
fit = kmeans(trainset.clustering, 3, nstart=5)
# calculates silhouettes values for k=3
kms = silhouette(fit$cluster, dist(trainset.clustering))
plot(kms, col=1:3, border=NA, main = "Silhouette Plot for k=3")
# Dissimilarity Matrix
dissplot(dist(trainset.clustering), labels=fit$cluster, options=list(main="Kmeans clustering with k=3"))
# Evaluation
sprintf("Similarity with (k=3): %f", cluster.evaluation(labels.list, fit$cluster))
## [1] "Similarity with (k=3): 0.529782"
# visualization
fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(1,2), main="Dimensions: 1 & 2",
geom="point")
fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(1,3), main="Dimensions: 1 & 3",
geom="point")
fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(1,4), main="Dimensions: 1 & 4",
geom="point")
fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(2,3), main="Dimensions: 2 & 3",
geom="point")
fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(2,4), main="Dimensions: 2 & 4",
geom="point")
fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(3,4), main="Dimensions: 3 & 4",
geom="point")
# Confusion Matrix
source("clustering_confusion_matrix.R")
confusion_matrix = clustering_confusion_matrix(fit$cluster, labels.list, 3, 6)
for (i in 1:3) {
pie_title = sprintf("Cluster %d Disribution", i)
pie(confusion_matrix[i,], main=pie_title, labels=class.names, col=rainbow(6))
}
#
for (i in 1:6) {
pie_title = sprintf("Class %s Distribution", class.names[i])
pie(confusion_matrix[,i], main=pie_title, labels=1:6, col=rainbow(3))
}
Evidenzia la misura di similarita’ tra i cluster e le classi del dataset, al variaaare del parametro k Il valore piu’ alto e’ ottenuto con k=6, in linea con le rilevazioni precedenti
similarity = c()
# Calculates the similarity between clusters and targets for an ascending value of k
for (k in 2:6) {
fit = kmeans(trainset.clustering, k, nstart = 5)
eval = cluster.evaluation(labels.list, fit$cluster)
similarity = append(similarity, eval)
}
plot(2:6, similarity, type="b", main = "Similarity plot for ascending K", xlab="K")